import pyarrow as pa
import pyarrow.parquet as pq
# import fiona
from math import radians, cos, sin, asin, sqrt
import pandas as pd
import numpy as np
import os
from engarde.decorators import none_missing, unique_index, is_shape
import engarde.generic
import engarde.decorators as ed
from pyproj import Geod
# for plotting map Leaflet library
import folium
from folium import plugins
import scipy
from itertools import cycle
import math
from scipy.spatial import distance
import timeit
import time
import datetime
import sys
from IPython.display import display
import matplotlib.colors as colorscale
#for plotting barcharts and histograms and line chart
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy import signal
import scipy.stats as stats
import pylab as pl
import plotly.plotly as py
import plotly.graph_objs as go
import geojson as jj
from geojson import Feature, Point, FeatureCollection
import colorsys
import random
from pykalman import KalmanFilter
from scipy import stats
plt.rcParams['figure.figsize'] = 13,8
import matplotlib.ticker as mtick
import warnings
# Silence the copious pandas/numpy warnings this notebook-style script emits.
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')
import plotly as pyPlot
# NOTE(review): hard-coded Plotly credentials committed to source — rotate this
# API key and load it from an environment variable or config file instead.
pyPlot.tools.set_credentials_file(username='ladhar.ravishankar', api_key='uzbvUCS23VktDu0VC3l0')
pyPlot.tools.set_config_file(world_readable=True, sharing='public')
import json
#start_time = timeit.default_timer()
# Load one month of AIS broadcast points and sort per vessel so row-to-row
# diffs below compare consecutive fixes of the same ship.
result_frame = pq.read_table('/bigdata0/rshankar/Zone18_2011/02/Broadcast.parquet').to_pandas()
result_frame = result_frame.sort_values(['MMSI', 'BaseDateTime'])
# True on the first row of each vessel; used to suppress cross-vessel diffs.
result_frame['CheckMMSI'] = (result_frame['MMSI'] != result_frame['MMSI'].shift(1))
# Previous fix's latitude/longitude in radians (NaN on each vessel's first row).
result_frame['latRad'] = np.where(result_frame['CheckMMSI'] != True , result_frame['lat'].shift(1).apply(radians), np.nan)
result_frame['lonRad'] = np.where(result_frame['CheckMMSI'] != True , result_frame['lon'].shift(1).apply(radians), np.nan)
# Current fix in radians.
lat1 = result_frame['lat'].apply(radians)
lon1 = result_frame['lon'].apply(radians)
# Deltas between consecutive fixes (sign is irrelevant under sin**2 below).
dlon = result_frame['lonRad'] - lon1
dlat = result_frame['latRad'] - lat1
# Haversine great-circle distance; 6367 is the Earth radius in km, so
# 'Distance' is km between consecutive fixes.
harvesineDist = (dlat/2).apply(sin)**2 + lat1.apply(cos) * result_frame['latRad'].apply(cos) * (dlon/2).apply(sin)**2
c = 2* np.arcsin(harvesineDist.apply(sqrt))
dist = 6367 * c
# c = 2 * (harvesineDist.apply(sqrt).apply(asin))
# dist = * c
##--- Creating a new Data frame for Distance
list_dist = pd.DataFrame({"Distance": dist})
# Attach the per-leg distance column to the main frame (positional concat).
result_frame = pd.concat([result_frame, list_dist], axis=1)
result_frame
# Parse the timestamp string into a real datetime.
result_frame['DateTime'] = pd.DatetimeIndex(result_frame['BaseDateTime'])
#df_Broadcast['DateTime']
# Previous fix's timestamp; a vessel's first row keeps its own timestamp so
# the first time diff is zero instead of crossing vessels.
result_frame['DateTime2'] = np.where(result_frame['CheckMMSI'] != True , result_frame['DateTime'].shift(1), result_frame['DateTime'])
result_frame['time_diff'] = (result_frame['DateTime'] - result_frame['DateTime2'])
# Elapsed time between fixes expressed in hours.
# NOTE(review): viewing the timedeltas through DatetimeIndex exposes only the
# hour/minute/second fields — any whole-day component of a gap is dropped, so
# gaps >= 24h under-count. Confirm whether .dt.total_seconds()/3600 was meant.
time = pd.DatetimeIndex(result_frame['time_diff'])
result_frame['totalHours'] = time.hour + (time.minute/60) + (time.second/3600)
result_frame= result_frame.reset_index()
#df_Broadcast
# Drop the scratch columns used for the diffs.
del result_frame['latRad']
del result_frame['lonRad']
del result_frame['DateTime2']
#del result_frame['time_diff']
del result_frame['CheckMMSI']
# Speed in km/h; zero time gaps yield inf, replaced with NaN and dropped.
result_frame['Speed'] = result_frame['Distance'] / result_frame['totalHours']
result_frame = result_frame.replace([np.inf, -np.inf], np.nan)
result_frame= result_frame.replace([np.inf, -np.inf], np.nan).dropna(subset=["Speed"], how="all")
#print (timeit.default_timer() - start_time)
df = result_frame
# 5-minute rolling mean of speed per voyage, re-attached positionally after
# sorting the frame the same way the groupby orders its output.
df1 = df.set_index(pd.DatetimeIndex(df.DateTime))
variable1 = df1.groupby(['VoyageID']).rolling("5T").Speed.mean().reset_index(name= "rolling_mean").rename(columns={"VoyageID": "VoyageID2", "DateTime": "DT2"})
df = df.sort_values(['VoyageID', 'DateTime'], ascending=True).reset_index()
df = pd.concat([df, variable1], axis=1)
df_test = df
# Size/metadata of the source parquet file (informational only).
statinfo = os.stat('/bigdata0/rshankar/Zone18_2011/02/Broadcast.parquet')
statinfo
# 'Stopped' is True where the 5-min mean speed is missing or >= 0.5; runs of
# equal flag values (shift-compare then cumsum) number the sub-voyages.
# NOTE(review): the name reads inverted — the True rows (mean >= 0.5) are the
# ones kept as "moving" below; confirm the intended threshold semantics.
df_test['Stopped']= df_test.groupby('VoyageID').apply(lambda x: x.rolling_mean.isnull()).values | df_test.groupby('VoyageID').apply(lambda x: x.rolling_mean >= 0.5).values
df_test['subVoyageIDs']= (df_test.groupby('VoyageID').apply(lambda x:x.Stopped.shift(1)) != df_test.groupby('VoyageID').apply(lambda x:x.Stopped) | (df_test.groupby('VoyageID').apply(lambda x: x.rolling_mean <= 0.5).values)).astype(int).cumsum().values
df_testResult= df_test[df_test['Stopped'] == True]
dataPoints = df_testResult.set_index(pd.DatetimeIndex(df_testResult.DateTime))
# Preserve the original positional index under a stable column name.
dataPoints= dataPoints.rename(columns={"index": "indexCol"})
df_CenteredWin = dataPoints
# Drop sub-voyages with fewer than 10 points.
datapoints= df_CenteredWin.groupby('subVoyageIDs').agg({'Speed':'count'}).reset_index().rename(columns ={'Speed':'totalPoints'})
datapoints = datapoints[datapoints.totalPoints < 10]
df_CenteredWin = df_CenteredWin[~(df_CenteredWin.subVoyageIDs.isin(datapoints.subVoyageIDs.unique()))]
# Split again wherever the gap between consecutive fixes exceeds one hour.
# NOTE(review): .dt.seconds ignores the days component, so a 25h gap reads as
# 1h — .dt.total_seconds() is likely what was meant.
df_CenteredWin['noise_timegaps']= df_CenteredWin.groupby('subVoyageIDs').apply(lambda x: (x.DateTime.diff().dt.seconds/3600 > 1)).values
df_CenteredWin['sub_subVoyageIDs']= (df_CenteredWin.groupby('subVoyageIDs').apply(lambda x:x.noise_timegaps.shift(1)) != df_CenteredWin.groupby('subVoyageIDs').apply(lambda x:x.noise_timegaps) | (df_CenteredWin.groupby('subVoyageIDs').apply(lambda x: x.DateTime.diff().dt.seconds/3600 > 1).values)).astype(int).cumsum().values
# Keep only time-contiguous pieces with more than 10 points.
total_Points = df_CenteredWin.groupby('sub_subVoyageIDs').agg({'Speed':'count'}).reset_index().rename(columns={'Speed':'totalPoints'})
total_Points = total_Points[total_Points.totalPoints > 10]
df_CenteredWin = df_CenteredWin[df_CenteredWin.sub_subVoyageIDs.isin(total_Points.sub_subVoyageIDs.unique())]
# Narrow working frame for the noise-detection stages below, keeping the
# DatetimeIndex of df_CenteredWin.
# Fix: the original dict literal listed 'indexCol' twice; in a Python dict the
# later duplicate silently overwrites the earlier one, so the duplicate entry
# is removed (both mapped the same series — behavior is unchanged).
dfdata = pd.DataFrame(data = { 'MMSI': df_CenteredWin.MMSI,
                               'VoyageID' : df_CenteredWin.VoyageID,
                               'subVoyageIDs' : df_CenteredWin.subVoyageIDs,
                               'indexCol' : df_CenteredWin.indexCol,
                               'subVoyageIDs_subV' : df_CenteredWin.sub_subVoyageIDs,
                               'DateTime' : df_CenteredWin.DateTime,
                               'distance' : df_CenteredWin.Distance,
                               'Speed' : df_CenteredWin.Speed,
                               'totalHours' : df_CenteredWin.totalHours,
                               'lat' : df_CenteredWin.lat,
                               'lon' : df_CenteredWin.lon})
def centerMeanWin(data):
    """Centered 60-minute rolling mean of Speed for one trajectory segment.

    `data` must carry a 'Speed' and a 'DateTime' column and be indexed by a
    time-sorted DatetimeIndex named 'DateTime'. A trailing (right-closed)
    rolling sum on the original series is combined with a trailing sum taken
    on a time-mirrored copy (which acts as a leading, left-closed window on
    the original), and the total is divided by the combined window counts to
    approximate a centered window.
    """
    window = '60T'
    t_min = pd.Timestamp(data.DateTime.min())
    t_max = pd.Timestamp(data.DateTime.max())
    # Mirror the time axis so a trailing window on the mirror behaves like a
    # leading window on the original series.
    mirrored = data.set_index(t_max - (data.index - t_min)).sort_index()
    left_win = mirrored.Speed.rolling(window, closed='left').sum().reset_index()
    left_win = left_win.set_index(pd.DatetimeIndex(left_win.DateTime))
    left_count = left_win.Speed.rolling(window).count()
    # Map the mirrored timestamps back onto the original time axis.
    left_win = left_win.set_index(t_min + (t_max - left_win.index))
    right_win = data.Speed.rolling(window, closed='right').sum().reset_index()
    right_win = right_win.set_index(pd.DatetimeIndex(right_win.DateTime))
    right_count = right_win.Speed.rolling(window).count()
    # Align the two sums by timestamp; counts are combined positionally.
    return (left_win.Speed.fillna(0) + right_win.Speed.fillna(0)) / (left_count.values + right_count.values)
def centerSTDWin(data):
    """Centered 60-minute rolling standard deviation of Speed.

    Fix: the original body duplicated centerMeanWin()'s fifteen lines
    verbatim to obtain the centered rolling mean; it now calls
    centerMeanWin() directly, so the two stay in sync. The standard
    deviation itself is unchanged: sqrt(rolling sum of squared deviations /
    rolling count) over the same 60-minute trailing window.

    Parameters
    ----------
    data : DataFrame for one trajectory segment with 'DateTime' and 'Speed'
        columns, indexed by a time-sorted DatetimeIndex named 'DateTime'.

    Returns
    -------
    Series of rolling standard deviations aligned to data's timestamps.
    """
    period = '60T'
    # Centered rolling mean — identical computation to the inline copy it
    # replaces (see centerMeanWin).
    centeredMeanwin = centerMeanWin(data)
    # Population-style std over the trailing 60-minute window.
    squareVal = np.square(data.Speed - centeredMeanwin)
    sum_SquareVal = squareVal.rolling(period).sum()
    count_SquareVal = squareVal.rolling(period).count()
    return np.sqrt(sum_SquareVal / count_SquareVal)
# Centered rolling mean/std per time-contiguous segment, re-attached
# positionally (groupby output order matches the frame's sort order).
dfdata['centeredMean']= dfdata.groupby('subVoyageIDs_subV').apply(lambda x: centerMeanWin(x)).values
dfdata['centeredSTD']= dfdata.groupby('subVoyageIDs_subV').apply(lambda x: centerSTDWin(x)).values
# z-score of each point's speed against its centered window; >= 1.0 is noise.
dfdata['centeredZScore'] = (dfdata.Speed - dfdata.centeredMean) / (dfdata.centeredSTD)
dfdata['zScore_Noise'] = np.where(dfdata['centeredZScore'] >= 1.0, 'noise', 'noisefree')
# Longitude bands at the zone borders get their own labels.
dfdata['zScore_Noise'] = np.where((dfdata.lon.between(-72.011111, -72.000000)), 'outofzone', dfdata['zScore_Noise'])
dfdata['zScore_Noise'] = np.where((dfdata.lon.between(-77.999999,-77.999000)), 'outofzone_End', dfdata['zScore_Noise'])
len(dfdata[dfdata['zScore_Noise'] == 'outofzone'])
len(dfdata[dfdata['zScore_Noise'] == 'outofzone_End'])
# dfdata= dfdata[~(dfdata['zScore_Noise'] == 'outofzone_End') | (dfdata['zScore_Noise'] == 'outofzone')]
len(dfdata[dfdata['zScore_Noise'] == 'noise'])
# Previous/next coordinates within each segment, for the turn-angle test
# below (NaN at each segment's first/last row).
dfdata['lat1'] = dfdata.groupby('subVoyageIDs_subV').apply(lambda x: x.lat.shift(1)).values
dfdata['lat_1'] = dfdata.groupby('subVoyageIDs_subV').apply(lambda x: x.lat.shift(-1)).values
dfdata['lon1'] = dfdata.groupby('subVoyageIDs_subV').apply(lambda x: x.lon.shift(1)).values
dfdata['lon_1'] = dfdata.groupby('subVoyageIDs_subV').apply(lambda x: x.lon.shift(-1)).values
def getAngle(data):
    """Turning angle (degrees, 0-180) at each point of a trajectory.

    For every row, the angle at B = (lat, lon) is the angle between the
    segments B->A and B->C, where A = (lat1, lon1) is the previous point and
    C = (lat_1, lon_1) the next point of the same sub-voyage.

    Rows missing a neighbour (NaN from the edge shifts) or with a zero-length
    segment produce NaN — callers use np.isnan(angle) to flag isolated jumps,
    so out-of-range cosines are deliberately NOT clipped.

    Parameters
    ----------
    data : object with array-valued attributes lat1, lat, lat_1, lon1, lon,
        lon_1 (e.g. a DataFrame with those columns).

    Returns
    -------
    numpy array of angles in degrees, NaN where undefined.
    """
    # Stack into (n, 2) point arrays and vectorize: the original built Python
    # lists via zip() and looped np.dot / np.linalg.norm per row, which is
    # O(n) interpreter-level work for what is one numpy pass.
    a = np.column_stack((data.lat1.values, data.lon1.values))
    b = np.column_stack((data.lat.values, data.lon.values))
    c = np.column_stack((data.lat_1.values, data.lon_1.values))
    ba = a - b
    bc = c - b
    # Row-wise dot products and norms (same arithmetic as the per-row loops).
    dotPro = np.einsum('ij,ij->i', ba, bc)
    norms = np.linalg.norm(ba, axis=1) * np.linalg.norm(bc, axis=1)
    # 0/0 and x/0 yield NaN/inf (RuntimeWarning, globally silenced above),
    # exactly as the original element-wise np.float64 division did; arccos of
    # anything outside [-1, 1] is NaN.
    cosine_angle = dotPro / norms
    return np.degrees(np.arccos(cosine_angle))
# Turn angle at every point (NaN at segment ends / zero-length legs).
dfdata['angle'] = getAngle(dfdata)
df_data = dfdata
# Quick look at the angle/distance relationship.
fig, ax = plt.subplots(figsize=(16,10))
ax.scatter(dfdata.angle, dfdata['distance'])
ax.set_xlabel('angle')
ax.set_ylabel('distance')
plt.show()
# Per-segment bounding box; its diagonal (in raw degrees) approximates how
# far the segment actually travels overall.
maxBounding = dfdata.groupby('subVoyageIDs_subV').agg({'lat':'max', 'lon':'max'}).reset_index()
minBounding = dfdata.groupby('subVoyageIDs_subV').agg({'lat':'min', 'lon':'min'}).reset_index()
boundingBox = maxBounding.merge(minBounding, on='subVoyageIDs_subV')
boundingBox = boundingBox.rename(columns={'lat_x':'lat_max', 'lon_x':'lon_max', 'lat_y':'lat_min', 'lon_y':'lon_min'})
boundingBox['distance'] = np.sqrt(np.square(boundingBox.lat_max - boundingBox.lat_min) + np.square(boundingBox.lon_max - boundingBox.lon_min)).values
# boundingBox[boundingBox['subVoyageIDs_subV'] == 44]
# boundingBox[boundingBox.distance == 8.921442425626267]
# Segments whose bounding-box diagonal is tiny are treated as short/bad data.
boundingBoxSubV= boundingBox[boundingBox.distance < 0.09]
len(boundingBoxSubV.subVoyageIDs_subV.unique())
boundingBoxData= dfdata[(dfdata.subVoyageIDs_subV.isin(boundingBoxSubV.subVoyageIDs_subV.unique()))]
bbUnique = boundingBoxData.subVoyageIDs_subV.unique()
# NOTE(review): `falsesubV` and `shortsubV` are not defined anywhere in this
# file (stale notebook state) — this loop raises NameError when the file is
# run top-to-bottom as a script; confirm which frames were intended.
for idx, subVV in enumerate(falsesubV[10:60]):
    display(subVV)
    subV_short = shortsubV[shortsubV['subVoyageIDs'] == subVV]
    plt.plot(subV_short.lon, subV_short.lat)
    plt.show()
# subV = shortsubV[shortsubV['subVoyageIDs_subV'] == 10647965]
# coordinates = list(zip(subV.lat.tolist(), subV.lon.tolist()))
# m = folium.Map(location=[41.8240, -71.4128])
# folium.PolyLine(coordinates, color = "Black").add_to(m)
# m
# Drop the tiny-bounding-box segments from the working frame.
dfdata= dfdata[~(dfdata.subVoyageIDs_subV.isin(boundingBoxSubV.subVoyageIDs_subV.unique()))]
# Candidate noise populations drawn from the z-score-flagged points.
dataNoise = dfdata[dfdata['zScore_Noise'] == 'noise']
len(dataNoise)
# distanceNoise = dataNoise[dataNoise.distance.between(100, 200)]
# Sharp turns (0-100 degrees) among flagged points.
distAngleNoise = dataNoise[dataNoise.angle.between(0 , 100)]
len(distAngleNoise)
# Long jumps (>= 10 km legs) at any angle.
LdistanceNoise = dataNoise[dataNoise.distance.between(10, dataNoise.distance.max())]
LdistAngleNoise = LdistanceNoise[LdistanceNoise.angle.between(0 , 180)]
len(LdistAngleNoise)
# Isolated jumps: undefined angle with implausible speed and distance.
largeNoise = dataNoise[(np.isnan(dataNoise.angle) == True) & (dataNoise.Speed > 100) & (dataNoise.distance > 10)]
len(largeNoise)
dfnoise = dfdata[((dfdata.Speed > 100) & (dfdata.distance > 100)) & (np.isnan(dfdata.angle) == True)]
len(dfnoise)
# Every segment containing at least one flagged point...
rawData= dfdata[(dfdata.subVoyageIDs_subV.isin(distAngleNoise.subVoyageIDs_subV.unique())) | (dfdata.subVoyageIDs_subV.isin(LdistAngleNoise.subVoyageIDs_subV.unique())) | (dfdata.subVoyageIDs_subV.isin(largeNoise.subVoyageIDs_subV.unique())) | (dfdata.subVoyageIDs_subV.isin(dfnoise.subVoyageIDs_subV.unique())) ]
display(len(rawData))
display(len(rawData.subVoyageIDs_subV.unique()))
# ...and, within those segments, the flagged points themselves vs the rest.
totalNoise = rawData[(rawData.indexCol.isin(distAngleNoise.indexCol)) | (rawData.indexCol.isin(LdistAngleNoise.indexCol)) | (rawData.indexCol.isin(largeNoise.indexCol)) | (rawData.indexCol.isin(dfnoise.indexCol))]
cleanedData= rawData[~(rawData.indexCol.isin(totalNoise.indexCol))]
display(len(cleanedData))
display(len(cleanedData.subVoyageIDs_subV.unique()))
# Keep only the non-noise points and recompute leg distance/speed over the
# now-contiguous cleaned points.
df_cleaned= dfdata[dfdata.indexCol.isin(cleanedData.indexCol)]
# NOTE(review): df_cleaned is a slice of dfdata; the del/column writes below
# rely on SettingWithCopy behavior — a .copy() here would make it explicit.
del df_cleaned['lat1']
del df_cleaned['lat_1']
del df_cleaned['lon1']
del df_cleaned['lon_1']
del df_cleaned['totalHours']
# Haversine distance between consecutive surviving points, restarting at
# each segment boundary (same formula as the initial pass above).
df_cleaned['ChecksubVID']=df_cleaned.subVoyageIDs_subV != df_cleaned.subVoyageIDs_subV.shift(1)
df_cleaned['latrad']= np.where(df_cleaned['ChecksubVID'] != True, df_cleaned.lat.shift(1).apply(radians), np.nan)
df_cleaned['lonrad']= np.where(df_cleaned['ChecksubVID'] != True, df_cleaned.lon.shift(1).apply(radians), np.nan)
lat1 = df_cleaned['lat'].apply(radians)
lon1 = df_cleaned['lon'].apply(radians)
dlon = df_cleaned['lonrad'] - lon1
dlat = df_cleaned['latrad'] - lat1
harvesineDist = (dlat/2).apply(sin)**2 + lat1.apply(cos) * df_cleaned['latrad'].apply(cos) * (dlon/2).apply(sin)**2
c = 2* np.arcsin(harvesineDist.apply(sqrt))
dist = 6367 * c
df_cleaned['newdist'] = dist
# totalHours = df_cleaned.DateTime.diff().dt.seconds/3600
# NOTE(review): .dt.seconds drops the days part of a gap — .dt.total_seconds()
# is likely what was meant.
df_cleaned['totalHours'] =np.where(df_cleaned['ChecksubVID'] != True, df_cleaned.DateTime.diff().dt.seconds/3600, np.nan)
df_cleaned['newspeed']= df_cleaned['newdist'] / df_cleaned['totalHours']
del df_cleaned['latrad']
del df_cleaned['lonrad']
del df_cleaned['ChecksubVID']
del df_cleaned['totalHours']
# # breaking trajectories based on new distance and the new speed, hwere it seems having the large gaps of speed and distance
# df_cleaned['dist_groups']= df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x: ((x.newdist > 10) & (x.newspeed > 50))).values
# df_cleaned['dist_ids']= (df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x:x.dist_groups.shift(1)) != df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x:x.dist_groups) | (df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x: ((x.newdist > 10) & (x.newspeed > 100) )).values)).astype(int).cumsum().values
# Split segments again at large-jump points (newdist > 10 with high speed),
# then keep pieces with more than 10 points.
df_cleaned['dist_groups']= df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x: ((x.newdist > 10) & (x.newspeed > 50))).values
df_cleaned['dist_ids']= ((df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x:x.dist_groups.shift(1)) != (df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x:x.dist_groups))) | (df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x: ((x.newdist > 10) & (x.newspeed > 100) )).values)).astype(int).cumsum().values
totalPoints= df_cleaned.groupby('dist_ids').agg({'Speed':'count'}).reset_index().rename(columns={'Speed':'totalPoints'})
totalPoints= totalPoints[totalPoints.totalPoints > 10]
cleanedData= df_cleaned[df_cleaned.dist_ids.isin(totalPoints.dist_ids.unique())]
# ---- GeoJSON exports: raw / cleaned / full / bounding-box segments ----
def _export_linestrings(frame, group_col, out_path):
    """Write one LineString Feature per `group_col` value of `frame` to out_path.

    Replaces four hand-unrolled, nearly identical export loops. Per feature:
    coordinates are the group's (lon, lat) points in row order, 'sample' is
    the group's point count and 'id' its (single) sub-sub-voyage id.
    Fix: the file handle is now closed via `with` — the originals passed a
    bare open(...) to json.dump and leaked the handle.
    """
    features = []
    for seg_id in frame[group_col].unique():
        seg = frame[frame[group_col] == seg_id]
        coords = [[lon, lat] for lon, lat in zip(seg.lon, seg.lat)]
        features.append({
            "type": "Feature",
            "properties": {"sample": len(seg.subVoyageIDs_subV), 'id': float(seg.subVoyageIDs_subV.unique())},
            "geometry": {
                "type": "LineString",
                "coordinates": coords,
            }
        })
    with open(out_path, "w") as fh:
        json.dump(FeatureCollection(features), fh)

# Segments containing flagged noise, before and after point removal.
_export_linestrings(rawData, 'subVoyageIDs_subV', "Zone18_2011_02_rawGeometery.geojson")
_export_linestrings(cleanedData, 'dist_ids', "Zone18_2011_02_cleanedGeometery.geojson")
# Every surviving segment, and the discarded tiny-bounding-box segments.
_export_linestrings(dfdata, 'subVoyageIDs_subV', "Zone18_2011_02_rawGeometery_dfdata.geojson")
_export_linestrings(boundingBoxData, 'subVoyageIDs_subV', "Zone18_2011_02_boundingboxGeometery.geojson")
# Summary counts at each stage of the pipeline, shown as a table, printed,
# and plotted as a bar chart.
d = {'Vessels': [len(df_CenteredWin.MMSI.unique())], 'Trajectories': len(df_CenteredWin.VoyageID.unique()), 'subTrajectories': len(df_CenteredWin.subVoyageIDs.unique()), 'subTrajectories Broke on Time': len(df_data.subVoyageIDs_subV.unique()), 'Short SubTrajectories or bad Data': len(boundingBoxSubV.subVoyageIDs_subV.unique()),
     'Remaining Data after': len(dfdata.subVoyageIDs_subV.unique()), 'Raw Trajectories': len(rawData.subVoyageIDs_subV.unique()),
     'Cleaned Trajectories': len(cleanedData.subVoyageIDs_subV.unique()), 'Noise Points' : len(totalNoise)}
staticsData = pd.DataFrame(data=d)
display(staticsData)
# Long-format (label, value) view for plotting.
staticsDataTable= staticsData.stack().reset_index(name = "TotalData").rename(columns={'level_1': 'label'})
del staticsDataTable['level_0']
print(staticsDataTable)
ax = staticsDataTable.plot.bar(x='label', y='TotalData')
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Spot-check one segment: speed series, leg distances, and track with its
# flagged points overlaid in red.
subV=dfdata[dfdata['subVoyageIDs_subV'] == 30002]
subVnoise= subV[subV.zScore_Noise == 'noise']
plt.plot(subV.Speed, '-go')
plt.show()
plt.plot(subV.distance, '-go')
plt.show()
plt.plot(subV.lon, subV.lat, '-go')
plt.plot(subVnoise.lon, subVnoise.lat, 'ro')
plt.show()
# Angle/distance scatter restricted to the points ultimately labelled noise.
fig, ax = plt.subplots(figsize=(16,10))
ax.scatter(totalNoise.angle, totalNoise.distance)
ax.set_xlabel('angle')
ax.set_ylabel('distance')
plt.show()
# ---- GeoJSON exports keyed by the coarser subVoyageIDs grouping ----
# NOTE(review): these features use the property key 'id ' (trailing space),
# unlike the earlier exports' 'id'. It is preserved byte-for-byte because map
# consumers may depend on it, but it looks like a typo worth confirming.
def _export_subvoyage_linestrings(frame, out_path):
    """Write one LineString Feature per subVoyageIDs value of `frame`.

    Replaces three hand-unrolled, identical export loops. Fix: the file
    handle is now closed via `with` — the originals passed a bare open(...)
    to json.dump and leaked the handle.
    """
    features = []
    for seg_id in frame['subVoyageIDs'].unique():
        seg = frame[frame['subVoyageIDs'] == seg_id]
        coords = [[lon, lat] for lon, lat in zip(seg.lon, seg.lat)]
        features.append({
            "type": "Feature",
            "properties": {"sample": len(seg.subVoyageIDs), 'id ': float(seg.subVoyageIDs.unique())},
            "geometry": {
                "type": "LineString",
                "coordinates": coords,
            }
        })
    with open(out_path, "w") as fh:
        json.dump(FeatureCollection(features), fh)

_export_subvoyage_linestrings(cleanedData, "/bigdata0/rshankar/geofiles/cleanedGeoData/Zone18_02_geometery.geojson")
_export_subvoyage_linestrings(rawData, "/bigdata0/rshankar/geofiles/rawGeoData/Zone18_02_rawGeometery.geojson")
# NOTE(review): `largeDistData` is not defined anywhere in this file (stale
# notebook state) — this call raises NameError when run as a script, exactly
# as the original loop did at this point.
_export_subvoyage_linestrings(largeDistData, "/bigdata0/rshankar/geofiles/cleanedGeoData/Zone18_02_LDistGeometery.geojson")
# Paths of the exported GeoJSON layers for the folium map below.
# NOTE(review): this rebinds `rawData`/`CleanedData` from DataFrames to path
# strings — the frames are unreachable under those names from here on.
rawData = os.path.join('/bigdata0/rshankar/geofiles/rawGeoData', 'Zone18_02_rawGeometery.geojson')
CleanedData = os.path.join('/bigdata0/rshankar/geofiles/cleanedGeoData', 'Zone18_02_geometery.geojson')
# CleanedLDistData = os.path.join('/bigdata0/rshankar/geofiles/cleanedGeoData', 'Zone18_02_LDistGeometery.geojson')
def style_function(feature):
    """Folium GeoJson style callback: render the cleaned layer in red.

    The `feature` argument is required by folium's callback signature but is
    not used — every feature gets the same style.
    """
    style = dict(fillOpacity=0.2, weight=2)
    style['color'] = 'red'
    return style
def style_functionA(feature):
    """Folium GeoJson style callback: render the raw layer in yellow.

    The `feature` argument is required by folium's callback signature but is
    not used — every feature gets the same style.
    """
    style = dict(fillOpacity=0.2, weight=2)
    style['color'] = 'Yellow'
    return style
# Base map with several selectable tile layers.
m = folium.Map(
    location=[-59.1759, -11.6016], tiles='stamenterrain', zoom_start=5
)
folium.TileLayer('openstreetmap').add_to(m)
folium.TileLayer('StamenToner').add_to(m)
folium.TileLayer('stamenterrain').add_to(m)
folium.TileLayer('Mapbox Control Room').add_to(m)
# Raw (yellow) and cleaned (red) trajectory layers from the exported GeoJSON.
folium.GeoJson(
    rawData,
    name='rawData',
    style_function= style_functionA
).add_to(m)
folium.GeoJson(
    CleanedData,
    name='CleanedData',
    style_function= style_function
).add_to(m)
# datacord = list(zip(noiseData_angle.lat, noiseData_angle.lon))
# for datacordlatlon in datacord:
#     folium.CircleMarker(datacordlatlon, radius=10, color='green').add_to(m)
folium.LayerControl().add_to(m)
# Write the interactive map to disk.
m.save('geometry_zone18_02.html')
# NOTE(review): `dataNoiseRecord_distAngle` is only assigned further down in
# this file — as a linear script this section raises NameError; the notebook
# cells were evidently executed in a different order than they are stitched.
angleUnique = dataNoiseRecord_distAngle.subVoyageIDs_subV.unique()
Angledata = dfdata[dfdata.subVoyageIDs_subV.isin(dataNoiseRecord_distAngle.subVoyageIDs_subV)]
Angledata
# subV = Angledata[Angledata['subVoyageIDs_subV'] == 26226]
# subV_Noise = dataNoiseRecord_distAngle[dataNoiseRecord_distAngle['subVoyageIDs_subV'] == 26226]
# plt.plot(subV.lon, subV.lat, '-go')
# plt.plot(subV_Noise.lon, subV_Noise.lat, 'ro')
# plt.show()
len(Angledata.subVoyageIDs_subV.unique())
# Plot a window of affected segments with their flagged points starred.
for idx, ids in enumerate(angleUnique[50:100]):
    subV = Angledata[Angledata['subVoyageIDs_subV'] == ids]
    subV_noise = dataNoiseRecord_distAngle[dataNoiseRecord_distAngle['subVoyageIDs_subV'] == ids]
    display(ids)
    display(subV_noise.angle)
    plt.plot(subV.lon, subV.lat, '--go')
    plt.plot(subV_noise.lon, subV_noise.lat, 'r*')
    plt.show()
# Interactive look at one segment.
subV = Angledata[Angledata['subVoyageIDs_subV'] == 8111]
coordinates = list(zip(subV.lat.tolist(), subV.lon.tolist()))
m = folium.Map(location=[41.8240, -71.4128])
folium.PolyLine(coordinates, color = "Black").add_to(m)
m
# NOTE(review): `dataNoiseRecord` is never defined in this file (stale
# notebook state) — this section raises NameError when run top-to-bottom.
# Flagged points with long legs (> 1.4 km) and sharp turns (0-90 degrees).
dataNoiseRecord_distance= dataNoiseRecord[dataNoiseRecord.distance > 1.4]
dataNoiseRecord_distAngle = dataNoiseRecord_distance[dataNoiseRecord_distance.angle.between(0, 90)]
len(dataNoiseRecord_distAngle.subVoyageIDs_subV.unique())
fig, ax = plt.subplots(figsize=(16,10))
ax.scatter(dataNoiseRecord_distAngle.angle, dataNoiseRecord_distAngle.distance)
ax.set_xlabel('angle')
ax.set_ylabel('distance')
plt.show()
angleUnique = dataNoiseRecord_distAngle.subVoyageIDs_subV.unique()
Angledata = dfdata[dfdata.subVoyageIDs_subV.isin(dataNoiseRecord_distAngle.subVoyageIDs_subV)]
# Plot the first 50 affected segments with their flagged points starred.
for idx, ids in enumerate(angleUnique[0:50]):
    subV = Angledata[Angledata['subVoyageIDs_subV'] == ids]
    subV_noise = dataNoiseRecord_distAngle[dataNoiseRecord_distAngle['subVoyageIDs_subV'] == ids]
    display(ids)
    display(subV_noise.angle)
    plt.plot(subV.lon, subV.lat, '--go')
    plt.plot(subV_noise.lon, subV_noise.lat, 'r*')
    plt.show()
# Interactive look at one segment.
subV = Angledata[Angledata['subVoyageIDs_subV'] == 1384]
coordinates = list(zip(subV.lat.tolist(), subV.lon.tolist()))
m = folium.Map(location=[41.8240, -71.4128])
folium.PolyLine(coordinates, color = "Black").add_to(m)
m